# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# Installing Packages:
# BiocManager::install("oligo")
# BiocManager::install("limma")
# BiocManager::install("biomaRt")
# BiocManager::install("AnnotationDbi")
# BiocManager::install("Biobase")
# Importing Libraries
# library(oligo)
# library(limma)
# library(biomaRt)
# library(dplyr)
# library(tibble)
# library(methods)
# library(AnnotationDbi)
# library(Biobase)
#Reading in runsheet with metadata
csv_file <- read.table(file = r"(C:\Users\linde\rmarkdown\affy_processing\data_runsheets\GLDS_6_runsheet.csv)",
sep=",",
header = TRUE,
check.names = FALSE
)
#Accessing arrray data files located in column name 'Path to Raw Data File'
array_data_files <- csv_file["Path to Raw Data File"]
#Importing and read *.CEL files using oligo function 'read.celfiles()'
raw_data <- oligo::read.celfiles(array_data_files)
## Platform design info loaded.
## Reading in : C:\Users\linde\rmarkdown\input_data\GLDS6\GLDS-6_array_Rnor_SDR_keratinocytes_CTRL_Rep1_GSM144733_raw1.CEL.gz
## Reading in : C:\Users\linde\rmarkdown\input_data\GLDS6\GLDS-6_array_Rnor_SDR_keratinocytes_CTRL_Rep2_GSM144734_raw.CEL.gz
## Reading in : C:\Users\linde\rmarkdown\input_data\GLDS6\GLDS-6_array_Rnor_SDR_keratinocytes_CTRL_Rep3_GSM144735_raw.CEL.gz
## Reading in : C:\Users\linde\rmarkdown\input_data\GLDS6\GLDS-6_array_Rnor_SDR_keratinocytes_IR_56Fe_Rep1_GSM144736_raw.CEL.gz
## Reading in : C:\Users\linde\rmarkdown\input_data\GLDS6\GLDS-6_array_Rnor_SDR_keratinocytes_IR_56Fe_Rep2_GSM144737_raw.CEL.gz
## Reading in : C:\Users\linde\rmarkdown\input_data\GLDS6\GLDS-6_array_Rnor_SDR_keratinocytes_IR_56Fe_Rep3_GSM144738_raw.CEL.gz
#'raw_data' type - ExpressionFeatureSet Class
# Parameter Definitions:
#file – path of runsheet file to be accessed
#sep - setting delimiter, default is white space
#header - determines if first row in dataframe is column names
#check.names - if TRUE, variable names are checked and corrected for synaticially validity
#Assess quality of raw data through visualization of plots
# First plot: density plot
# hist() function plots the density estimates for each sample
# Purpose: Determine technical variation between samples
density_plot <- hist(raw_data, transfo=log2, which=c("pm", "mm", "bg", "both", "all"),
nsample=10000, target = "core", main = "Density of raw intensities for multiple arrays")
## TODO: Add sample legend
# Second Plot: Pseudoimage
# image() function produces a pseudoimage for each array
# Purpose: Identify spatial abnormalities
pseudoimage <- image(raw_data, transfo=log2)
# Third Plot: MA plot
# MAplot() function creates MA plots using a reference array
# Purpose: Comparison of intensity values from one sample to another
MA_plot <- MAplot(raw_data, ylim=c(-2, 4))
# Parameter Definitions:
# transfo - used to scale the data
# which - defines specific probe types
# nsample - sample size used to produce plot
# target - specifies group of meta-probeset
# main - main title
# ylim - scale for y axis
# The IQRray statistic is obtained by ranking all the probe intensities from
# a given array and by computing the average rank for each probe set. The
# interquartile range (IQR) of the probe sets average ranks serves then as
# quality score.
#data - ExpressionFeatureSet object obtained after reading in celfiles into R
#obtaining intensity values for perfect match (pm) probes
pm_data<-pm(raw_data)
#ranking probe intensities for every array
rank_data<-apply(pm_data,2,rank)
#obtaining names of probeset for every probe
probeNames<-probeNames(raw_data)
#function computing IQR of mean probe ranks in probesets
get_IQR<-function(rank_data,probeNames){round(IQR(sapply(split(rank_data,probeNames),mean)),digits=2)}
#computing arIQR score
IQRray_score<-apply(rank_data,2,get_IQR,probeNames=probeNames)
#Probe Level normalization
#Converting raw data from ExpressionFeatureSet object to matrix
raw_table <- exprs(raw_data)
#Performing background correction
backgroundCorrection <- preprocessCore::rma.background.correct(raw_table)
#Performing Normalization using Quantile Normalization
probelevel_table <- preprocessCore::normalize.quantiles(backgroundCorrection, keep.names=TRUE)
#Writing probe level normalization to file
#write.csv(probelevel_table, r"(C:\Users\linde\rmarkdown\notebook\probelevel_table.csv)")
#Probeset Level Normalization
#RMA (Robust Multi-array Average) is most commonly used method for preprocessing normalization of
#Affymetrix microarray data. There are three steps in the RMA algorithm: Convolution Background
#Correction, Quantile Normalization, and Summarization using Median Polish.
#Performing RMA
normRMA <- rma(raw_data)
## Background correcting
## Normalizing
## Calculating Expression
#Converting normalized data to a matrix for easier accessibility
norm_probeset_table <- exprs(normRMA)
#Writing probeset level normalization to file
#write.csv(norm_probeset_table, r"(C:\Users\linde\rmarkdown\notebook\norm_probeset_table.csv)")
#QA to ensure normalization worked properly
#Samples should appear more similar to each other after normalization
#Same types of plots as QA of raw data for easy comparison
MAplot_norm <- MAplot(normRMA, ylim=c(-2, 4))
densityPlot_norm <- hist(normRMA, main = "Density of raw intensities for multiple arrays")
## Getting Factor Values
#Creating a list of Factor Values
fv <- data.frame(csv_file["Factor Value"])
all_factor_values <- fv[,1]
#Obtaining the name of each factor value
factor_values <- unique(all_factor_values)
#Making the names for both lists syntactically valid
all.factor.values <- make.names(all_factor_values)
factor.values <- make.names(factor_values)
## Creating Design Matrix
#Adding factor values to phenoData
ph = raw_data@phenoData
ph@data[ ,2] = c(all.factor.values)
colnames(ph@data)[2]="source"
#Grouping factor values
groups = ph@data$source
#Transforming group names into statistical factors
f = factor(groups,levels = factor.values)
#Creating matrix of values of the grouping variable
design = model.matrix(~ 0 + f)
colnames(design) <- factor.values
## Forming Constrast Matrix
#Fit design matrix to a linear model using lmFit()
fit = limma::lmFit(norm_probeset_table,design)
fit.groups <- colnames(fit$design) [which(fit$assign == 1)]
fit.index <- which(levels(levels) %in% fit.groups)
#Creating comparisons for contrast matrix
fit.group.names <- gsub(" ", "_", sub(",", "_", unique(csv_file$'Factor Value')))
combos <- combn(fit.groups, 2)
combos.names <- combn(fit.group.names, 2)
contrasts <- c(paste(combos[1,], combos[2,], sep = "-", paste(combos[2,], combos[1,], sep="-")))
contrast.names <- c(paste(combos.names[1,], combos.names[2,], sep = "v"), paste(combos.names[2,],
combos.names[1,], sep = "v"))
#Creating Contrast Matrix
cont.matrix <- limma::makeContrasts(contrasts = contrasts, levels=design)
contrast.fit <- limma::contrasts.fit(fit, cont.matrix)
## Testing and Accessing Results
#Performing statistical t-test using eBayes()
contrast.fit.eb <- limma::eBayes(contrast.fit)
#Extracting log fold change values
log_fold_change <- contrast.fit.eb$coefficients
#Extracting p-values
p_value <- contrast.fit.eb$p.value
#Changing names
log.fold.change <- c(log_fold_change)
p.value <- c(p_value)
#Adjusting the p-values using the Benjamini-Hochberg methods
adjusted_pvalue <- p.adjust(p_value,method="BH")
#Combining normalized expressions table with test results
de_dataframe <- cbind(norm_probeset_table, log.fold.change, p.value, adjusted_pvalue)
##TODO debug decideTests and summary functions
##This part is only for averaging the expression values for each factor value group
# for glds 6
# TODO: figure out how to separate factor values automatically
non_irradiated <- norm_probeset_table[,c(1,2,3)]
ion_beam_radiation <- norm_probeset_table[,c(4,5,6)]
avg_fv_6 <- data.frame(rowMeans(non_irradiated), rowMeans(ion_beam_radiation))
de_dataframe <- cbind(norm_probeset_table, log.fold.change, p.value, adjusted_pvalue, avg_fv_6)
## TODO: make dataframe for glds 189
## GLDS 6
#Searching Ensembl
listEnsembl()
## biomart version
## 1 genes Ensembl Genes 107
## 2 mouse_strains Mouse strains 107
## 3 snps Ensembl Variation 107
## 4 regulation Ensembl Regulation 107
#Selecting mart object pointing to the database
ensembl <- useEnsembl(biomart = "genes")
#Searching datasets in this database
searchDatasets(mart = ensembl, pattern = "Rat")
## dataset description version
## 167 rnorvegicus_gene_ensembl Rat genes (mRatBN7.2) mRatBN7.2
#Selecting dataset
ensembl <- useDataset(dataset = "rnorvegicus_gene_ensembl", mart = ensembl)
#Finding the microarray probe ID attribute
searchAttributes(mart = ensembl, pattern = "affy_")
## name description page
## 91 affy_rae230a AFFY RAE230A probe feature_page
## 92 affy_rae230b AFFY RAE230B probe feature_page
## 93 affy_raex_1_0_st_v1 AFFY RaEx 1 0 st v1 probe feature_page
## 94 affy_ragene_1_0_st_v1 AFFY RaGene 1 0 st v1 probe feature_page
## 95 affy_ragene_2_1_st_v1 AFFY RaGene 2 1 st v1 probe feature_page
## 96 affy_rat230_2 AFFY Rat230 2 probe feature_page
## 97 affy_rg_u34a AFFY RG U34A probe feature_page
## 98 affy_rg_u34b AFFY RG U34B probe feature_page
## 99 affy_rg_u34c AFFY RG U34C probe feature_page
## 100 affy_rn_u34 AFFY RN U34 probe feature_page
## 101 affy_rt_u34 AFFY RT U34 probe feature_page
#Generate mapped dataframe using getBM()
my_affy_ids <- rownames(norm_probeset_table)
gene_annotations <- getBM(
attributes = c(
"affy_rat230_2",
"ensembl_gene_id",
"uniprot_gn_symbol",
"go_id"
),
filters = "affy_rat230_2",
values = my_affy_ids,
mart = ensembl)
# Parameter Definitions:
#attributes - desired attributes extracted from specified database
#filters - filters to be used in the query
#values - query IDs
#mart - object connected to desird database and dataset
#Converting multimappings from multiple rows into list fields
gene_annotations <- gene_annotations %>%
#grouping by probe ID
group_by(affy_rat230_2) %>%
summarise(
#Convert uniprot_gn_symbol to a string of unique values in the group
SYMBOL = toString(unique(uniprot_gn_symbol)),
#Convert Ensembl IDs to a string of unique IDs in the group
ENSEMBL = toString(unique(ensembl_gene_id)),
#Convert go_ids to a string of unique IDs in the group
GO_Ids = toString(unique(go_id))
)
#glds6
#Joining tables to map to items in DE dataframe using left.join()
#Creating a column with probe IDS in DE dataframe
final_dataframe <- rownames_to_column(de_dataframe, var="probesets") %>% left_join(gene_annotations, by = c("probesets" = "affy_rat230_2"))
#Joining using probe IDS as common factor
## Writing Final table to file
#write.csv(final_dataframe, r"(C:\Users\linde\rmarkdown\notebook\GLDS_6_DGE.csv)")
#write.csv(final_dataframe, r"(C:\Users\linde\rmarkdown\notebook\GLDS_25_DGE.csv)")